07_Xenopus_RNAseq

Published

December 2, 2023

1 Overview Xenopus Laevis RNA-seq time-series

We downloaded and analysed the data from ‘Constrained vertebrate evolution by pleiotropic genes’ and ‘Genome evolution in the allotetraploid frog Xenopus laevis’. When combining the two time series, we have a total of 32 distinct time points that perfectly overlap with developmental stages present in our TMTcPro proteome. The processing pipeline is the same employed in Fig2.3_frog_RNAseq and Fig4.2_frog_RNAseq_merged_with_Expande

1.0.0.1 Loading libraries

1.0.0.2 Functions and misc

rm(list=ls(all=TRUE))

stages_frog_match <- c("unfertilised","1-cell","16-cell","110-cell","late neurula","mid-tailbud","late-tailbud","larva")
stages_frog_merged = c("oo12","oo34","oo56","egg","st02","st05","st08","st09","st10","st11","st12","st13","st15","st17","st19","st20","st21","st23","st25","st26","st28","st30","st31","st35","st37-38","st40","st43","st48","st61","st66")

stages_frog_order_all = c("oo12","oo34","oo56","egg","egg","st02","st02","st05","st05","st08","st08","st08","st09","st09","st09","st09","st10","st10","st10","st11","st11","st12","st12","st13","st13","st15","st15","st17","st17","st19","st19","st20","st20","st21","st21","st23","st23","st25","st25","st26","st26","st28","st28","st30","st30","st31","st31","st35","st35","st35","st37-38","st37-38","st40","st40","st43","st43","st48","st48","st61","st61","st66","st66")

theme_personalised <- theme_pubr(base_family = "Avenir") +
                      theme(panel.background=element_blank(),
                            panel.border=element_blank(),
                            panel.grid.major=element_blank(),
                            panel.grid.minor=element_blank(),
                            plot.background=element_blank())

heatmap_function <- function(matrix,title, legend) {
  # Calculate min & max
  min_value <- min(matrix)
  max_value<- max(matrix)
  middle_value <- (min_value + max_value) / 2
  
  # color scale
  color_scale <- colorRamp2(c(min_value, middle_value, max_value), c(
    "#A7D3D4FF", "#F1F1F1FF", "#E4C1D9FF"
    #"#DC876CFF",  "#FEFEFEFF", "#ADD0B5FF"
    ))

heatmap <- Heatmap(
    matrix,
    name = "Correlation",
    col = color_scale,
    show_row_names = TRUE,
    show_column_names = TRUE,
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    column_dend_height = 50,
    column_title = title,
    row_title = "",
    row_names_side = c("left"),
    column_title_gp = gpar(fontsize = 16),
    row_title_gp = gpar(fontsize = 16),
    column_names_rot = 45,
    row_names_rot = 0,
    cell_fun = function(j, i, x, y, width, height, fill) {
      grid.text(x = x, y = y, label = round(matrix[i, j], 2), gp = gpar(fontsize = 10))
    }, 
    heatmap_legend_param = list(title = paste(legend," correlation"), direction = "horizontal"))
  
  return(heatmap)
  # draw(heatmap,heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
}

mean_by_repStage.fn <-
  function(dataset, column_order) {
    result <- dataset %>%
      pivot_longer(-gene_id, names_to = "stage") %>%
      separate(stage, into = c("stage", "rep"), sep = "_") %>%
      group_by(gene_id, stage) %>%
      summarise(mean = mean(value)) %>% # Calculate the row means for each stage by gene
      pivot_wider(names_from = stage, values_from = mean) %>% # Pivot the result to wide format
      dplyr::select(gene_id, all_of(column_order)) %>%
      ungroup()

    return(result)
  }

1.1 Preparing tx2gene for Xenopus

Two different times series were download.

Downlaod the XENLA_10.1_GCF_XBmodels.gff3 annotation which combine Xenabase and NCBI-Xenbase gene models. The 3 column has multiple “non-standard” entries, we grep all of them to prepare the transcript to gene annotation files: guide_RNA,lnc_RNA,,mRNA,rRNA,scRNA,snRNA,snoRNA,tRNA,telomerase_RNA,C_gene_segment,D_loop,V_gene_segment,origin_of_replication,pseudogene,transcript

###
# gff3 view
# ID=XBmRNA50369;Parent=XBXL10_1g26894;gene=LOC108718813
###

# selecting for only mRNA does not work with this annotation => losing genes with salmon gene/trasncript quantification
# cat XENLA_10.1_GCF_XBmodels.gff3 | awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="mRNA"){split($9, a, ";"); gsub(/^ID=/, "", a[1]); gsub(/^Parent=/, "", a[2]); gsub(/^gene=|^exception=/, "", a[6]); print a[1],a[2],a[6]}}' > mRNA2gene_XenLa10.1_GCFXBmodels.txt

cat XENLA_10.1_GCF_XBmodels.gff3 | \
awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="guide_RNA" || $3=="lnc_RNA" || $3=="mRNA" || $3=="rRNA" || $3=="scRNA" || $3=="snRNA" || $3=="snoRNA" || $3=="tRNA" || $3=="telomerase_RNA" || $3=="C_gene_segment" || $3=="D_loop" || $3=="V_gene_segment" || $3=="origin_of_replication" || $3=="pseudogene" || $3=="transcript"){split($9, a, ";"); \gsub(/^ID=/, "", a[1]); gsub(/^Parent=/, "", a[2]); gsub(/^gene=|^exception=/, "", a[6]); print a[1],a[2],a[6]}}' > tx2gene_XenLa10.1_GCFXBmodels.txt

# converting gff3 to GTF, re do the annotation step and comapre the results
agat_convert_sp_gff2gtf.pl -gff XENLA_10.1_GCF_XBmodels.gff3 -o XENLA_10.1_GCF_XBmodels_agat.gtf

cat XENLA_10.1_GCF_XBmodels_agat.gtf |  
awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="transcript"){split($9, a, ";"); print a[1],a[4],a[2]}}' | 
sed 's/gene_id "//g;s/"\t ID "/\t/g;s/"\t transcript_id "/\t/g;s/"//' > tx2gene_XenLa10.1_GCFXBmodels_agat.txt

### XENLA_10.1_GCF_XBmodels.gtf | CDS  ==> NCBI-Xenbase gene models (gtf) => not good annotation
# if($3=="exon")
# if($3=="CDS")
# cat XENLA_10.1_GCF_XBmodels.gtf |  
# awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="CDS"){split($9, a, ";"); print a[1],a[2],a[3],a[4]}}' | 
# sed 's/gene_id "//g;s/"\t gene_name "/\t/g;s/"\t transcript_id "/\t/g;s/"\t transcript_name "//g;s/"//' > XENLA_10.1_GCF_XBmodels.gtf_cds_t2tgene.txt

You can either use tximeta or tximport packages to import Salmon transcript quantification data and summarize it at the gene level.

1.2 Loading quantification data

###############################
#### Salmon quantification files
###############################

# point to Salmon quantification file
dir_frog <- "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/b.raw_data/transcriptome/xenopus_laevis/"

gff <- "salmon_gffread_gff/"
# gtf <- "salmon_gffread_gtf/"

# List all directories containing data  
names  <- list.files(dir_frog)

###############################
#### Time series genome evolution
###############################
# read in your study design/sampleTable
samples_ts_genevo <- read_excel(
    path = "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/RNAseq_metadata_frog.xlsx",
    col_names = T,
    trim_ws = T,
    sheet = "time_series_1_genevo",  
    range = cell_cols("A:G")) |> 
  clean_names()
samples_ts_genevo <- samples_ts_genevo[1:27,] |> as_tibble()

## Obtain a vector of all filenames including the path
files_genevo <- file.path(dir_frog, paste0(gff,samples_ts_genevo$library_name,"/quant.sf"))
names(files_genevo) <-  samples_ts_genevo$dev_stage_name # Rename the files using the new names
print(names(files_genevo)) # Print the new file names
 [1] "oo12_1" "oo34_1" "oo56_1" "egg_1"  "egg_2"  "st08_1" "st08_2" "st08_3" "st09_1" "st09_2" "st10_1" "st10_2" "st10_3" "st12_1" "st12_2" "st15_1" "st15_2" "st20_1" "st20_2" "st25_1" "st25_2" "st30_1" "st30_2" "st35_1" "st35_2" "st35_3" "st40_1"
all(file.exists(files_genevo))
[1] TRUE
###############################
#### Time series EXPANDE
###############################

# read in your study design/sampleTable
samples_ts_expande <- read_excel(
    path = "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/RNAseq_metadata_frog.xlsx",
    col_names = T,
    trim_ws = T,
    sheet = "time_series_2_expande",  
    range = cell_cols("A:H")) |> 
  clean_names()

## Obtain a vector of all filenames including the path
files_expande <- file.path(dir_frog, paste0(gff,samples_ts_expande$library_name,"/quant.sf"))
samples_expande <- samples_ts_expande$dev_stage_name # Rename the files using the new names
names(files_expande) <- samples_expande
print(names(files_expande)) # Print the new file names
 [1] "st02_1"    "st02_2"    "st05_1"    "st05_2"    "st09_1"    "st09_2"    "st11_1"    "st11_2"    "st13_1"    "st13_2"    "st17_1"    "st17_2"    "st19_1"    "st19_2"    "st21_1"    "st21_2"    "st23_1"    "st23_2"    "st26_1"    "st26_2"    "st28_1"    "st28_2"    "st31_1"    "st31_2"    "st37-38_1" "st37-38_2" "st43_1"    "st43_2"    "st48_1"    "st48_2"    "st61_1"    "st61_2"    "st66_1"    "st66_2"   
all(file.exists(files_expande))
[1] TRUE
###############################
#### Time series merged
###############################

# read in your study design/sampleTable
samples_ts_merged <- read_excel(
    path = "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/RNAseq_metadata_frog.xlsx",
    col_names = T,
    trim_ws = T,
    sheet = "time_series_merged",  
    range = cell_cols("A:J")) |> 
  clean_names()

samples_ts_merged <- samples_ts_merged[1:61,] |> as_tibble()

## Obtain a vector of all filenames including the path
files_merged <- file.path(dir_frog, paste0(gff,samples_ts_merged$library_name,"/quant.sf"))
samples_merge_name <- samples_ts_merged$dev_stage_name # Rename the files using the new names
names(files_merged) <- samples_merge_name
print(names(files_merged)) # Print the new file names
 [1] "oo12_1"    "oo34_1"    "oo56_1"    "egg_1"     "egg_2"     "st02_1"    "st02_2"    "st05_1"    "st05_2"    "st08_1"    "st08_2"    "st08_3"    "st09_1"    "st09_2"    "st09_3"    "st09_4"    "st10_1"    "st10_2"    "st10_3"    "st11_1"    "st11_2"    "st12_1"    "st12_2"    "st13_1"    "st13_2"    "st15_1"    "st15_2"    "st17_1"    "st17_2"    "st19_1"    "st19_2"    "st20_1"    "st20_2"    "st21_1"    "st21_2"    "st23_1"    "st23_2"    "st25_1"    "st25_2"    "st26_1"    "st26_2"    "st28_1"    "st28_2"    "st30_1"    "st30_2"    "st31_1"    "st31_2"    "st35_1"    "st35_2"    "st35_3"    "st37-38_1" "st37-38_2" "st40_1"    "st43_1"    "st43_2"    "st48_1"    "st48_2"    "st61_1"    "st61_2"    "st66_1"    "st66_2"   
all(file.exists(files_merged))
[1] TRUE
###############################
#### Coldata
###############################

# create metadata df
coldata_ts_gene_evo <- data.frame(files=files_genevo, samples_ts_genevo, stringsAsFactors=FALSE) %>%  clean_names()
coldata_ts_expande <- data.frame(files=files_expande, samples_ts_expande, stringsAsFactors=FALSE)  %>%  clean_names()
coldata_ts_merged <- data.frame(files=files_merged, samples_ts_merged, stringsAsFactors=FALSE)  %>%  clean_names()

1.3 tximport

Imports transcript-level abundance, estimated counts and transcript lengths, and summarizes into matrices for use with downstream gene-level analysis packages. Soneson C, Love MI, Robinson MD (2015). “Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences.” F1000Research, 4. doi: 10.12688/f1000research.7563.1. We create annotations using makeTxDbFromGFF and match genes-transcripts IDs

###############################
#### tximport
###############################
tx2gene_gff_frog <- read_delim("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/frog_XenLa10.1/XENLA_10.1_GCF_XBmodels_tx2tgene.txt")

txi_ts1 <- tximport(files_genevo,
                            type = "salmon",
                            tx2gene = tx2gene_gff_frog,
                            ignoreTxVersion = T,
                            countsFromAbundance = "lengthScaledTPM")

txi_ts2 <- tximport(files_expande, 
                            type = "salmon", 
                            tx2gene = tx2gene_gff_frog, #tx2gene[,c("tx_id", "ensgene")]
                            ignoreTxVersion = T,
                            countsFromAbundance = "lengthScaledTPM")


txi_both <- tximport(files_merged,
                            type = "salmon",
                            tx2gene = tx2gene_gff_frog, #tx2gene[,c("tx_id", "ensgene")]
                            ignoreTxVersion = T,
                            countsFromAbundance = "lengthScaledTPM")

# if you exported transcript level data and want to append your gene symbols to the data frame
# Txi_trans <- as_tibble(Txi_trans$counts, rownames = "target_id")
# Txi_trans <- left_join(Txi_trans, Tx)

# txi has the matrices of the abundance, counts and length
names(txi_both)
[1] "abundance"           "counts"              "length"              "countsFromAbundance"
### Check that sample names match in both files
all(colnames(txi_both$counts) %in% coldata_ts_merged$dev_stage_name)
[1] TRUE

1.3.1 TPM ts1

###############################
#### TPM
###############################

TPM_txi_ts1.df <-
  as_tibble(txi_ts1$abundance, rownames = NA) %>%
  rownames_to_column("gene_id")

# save normalised and gene-level estimated TPM
write.table(TPM_txi_ts1.df, file = "frog_ts1_rna_tpm.tsv", sep = "\t", quote = F,row.names=FALSE, col.names = T)


# 42,042 to 30008
keepTPM <- rowSums(TPM_txi_ts1.df[,-1]) >= 2 # 2 TPM per gene
table(keepTPM)
keepTPM
FALSE  TRUE 
12074 29968 
TPM_txi_ts1.df <- TPM_txi_ts1.df[keepTPM,]

write.table(TPM_txi_ts1.df, file="frog_ts1_rna_tpmFilt.tsv", sep="\t", quote=F, row.names = F,col.names=T)

###############################
#### DESeq2
###############################
stages_ts1 <- c("oo12", "oo34", "oo56", "egg", "st08", "st09", "st10", "st12", "st15", "st20", "st25", "st30", "st35", "st40")

coldata_ts_gene_evo$condition <- coldata_ts_gene_evo$short_stage
group <- factor(coldata_ts_gene_evo$condition,levels = stages_ts1) 
coldata_ts_gene_evo$condition <- group

dds_ts1 <- DESeqDataSetFromTximport(txi_ts1, 
                                    colData = coldata_ts_gene_evo, 
                                    design = ~ condition)

dds_ts1 <- estimateSizeFactors(dds_ts1)
sizeFactors(dds_ts1)
    oo12_1     oo34_1     oo56_1      egg_1      egg_2     st08_1     st08_2     st08_3     st09_1     st09_2     st10_1     st10_2     st10_3     st12_1     st12_2     st15_1     st15_2     st20_1     st20_2     st25_1     st25_2     st30_1     st30_2     st35_1     st35_2     st35_3     st40_1 
2.03767395 1.56042078 2.12303127 0.48975437 0.62495824 1.55443948 1.21326408 6.54368122 1.59223618 1.68654819 1.13433200 0.94384874 3.90257506 0.71419557 0.50706537 0.40672291 0.53228633 0.88621026 0.65633804 1.28366672 0.65729847 1.46426451 1.00628501 1.14867224 0.03189453 2.71637337 1.15856811 
DESeq_ts1_frog<- counts(dds_ts1, normalized=TRUE) %>% as_tibble(., rownames ="gene_id") # normalised counts

# save normalised and gene-level estimated DESeq2
write.table(DESeq_ts1_frog, file="frog_ts1_DESeq2.tsv", sep="\t", quote=F, row.names = F, col.names = T)

# Filtering low abundance genes. We  keep genes that are expressed in at least 2 samples with >= 10 
keep_ts1 <- rowSums(assay(dds_ts1) >= 10) >= 2
dds_ts1 <- dds_ts1[keep_ts1,] #16246

dds_ts1 <- estimateSizeFactors(dds_ts1)
sizeFactors(dds_ts1)
    oo12_1     oo34_1     oo56_1      egg_1      egg_2     st08_1     st08_2     st08_3     st09_1     st09_2     st10_1     st10_2     st10_3     st12_1     st12_2     st15_1     st15_2     st20_1     st20_2     st25_1     st25_2     st30_1     st30_2     st35_1     st35_2     st35_3     st40_1 
2.03767395 1.56042078 2.12303127 0.48975437 0.62495824 1.55443948 1.21326408 6.54368122 1.59223618 1.68654819 1.13433200 0.94384874 3.90257506 0.71419557 0.50706537 0.40672291 0.53228633 0.88621026 0.65633804 1.28366672 0.65729847 1.46426451 1.00628501 1.14867224 0.03189453 2.71637337 1.15856811 
### Exporting normalized counts
normCounts_ts1 <- counts(dds_ts1, normalized=TRUE)

write.table(normCounts_ts1, file="normalisedDeseq_RNA_frog_ts1.tsv", sep="\t", quote=F, col.names=NA)

1.3.2 TPM ts2

###############################
#### TPM
###############################

TPM_txi_ts2.df <-
  as_tibble(txi_ts2$abundance, rownames = NA) %>%
  rownames_to_column("gene_id")

# save normalised and gene-level estimated TPM
write.table(TPM_txi_ts2.df, file = "frog_ts2_rna_tpm.tsv", sep = "\t", quote = F,row.names=FALSE, col.names = T)


# 42,042 to 30008
keepTPM <- rowSums(TPM_txi_ts2.df[,-1]) >= 2 # 2 TPM per gene
table(keepTPM)
keepTPM
FALSE  TRUE 
 6345 35697 
TPM_txi_ts2.df <- TPM_txi_ts2.df[keepTPM,]

write.table(TPM_txi_ts2.df, file="frog_ts2_rna_tpmFilt.tsv", sep="\t", quote=F, row.names = F,col.names=T)

###############################
#### DESeq2
###############################

unique_elements <- unique(coldata_ts_expande$short_stage)
unique_elements
 [1] "st02"    "st05"    "st09"    "st11"    "st13"    "st17"    "st19"    "st21"    "st23"    "st26"    "st28"    "st31"    "st37-38" "st43"    "st48"    "st61"    "st66"   
coldata_ts_expande$condition <- coldata_ts_expande$short_stage
group <- factor(coldata_ts_expande$condition,levels = unique_elements) 
coldata_ts_expande$condition <- group

dds_ts2 <- DESeqDataSetFromTximport(txi_ts2, 
                                    colData = coldata_ts_expande, 
                                    design = ~ condition)

dds_ts2 <- estimateSizeFactors(dds_ts2)
sizeFactors(dds_ts2)
   st02_1    st02_2    st05_1    st05_2    st09_1    st09_2    st11_1    st11_2    st13_1    st13_2    st17_1    st17_2    st19_1    st19_2    st21_1    st21_2    st23_1    st23_2    st26_1    st26_2    st28_1    st28_2    st31_1    st31_2 st37-38_1 st37-38_2    st43_1    st43_2    st48_1    st48_2    st61_1    st61_2    st66_1    st66_2 
0.7160142 0.6754361 0.8210946 1.0224680 1.2369617 1.2106088 0.9183008 0.8504560 0.8245654 0.8105487 1.1309582 1.0026527 1.1764919 1.1658180 0.8850037 0.9281650 0.9121465 0.8144108 1.3614312 1.5113700 1.3258554 1.6577136 1.5383287 1.2375204 1.2304822 1.3108729 0.8843944 1.2450906 1.0969132 0.8180797 1.1405791 0.9191203 0.8519913 0.6812380 
DESeq_ts2.df <- counts(dds_ts2, normalized=TRUE) %>% as_tibble(., rownames ="gene_id") # normalised counts

# save normalised and gene-level estimated DESeq2
write.table(DESeq_ts2.df, file="frog_ts2_DESeq2.tsv", sep="\t", quote=F, row.names = F, col.names = T)

# Filtering low abundance genes. We  keep genes that are expressed in at least 2 samples with >= 10 
keep_ts2 <- rowSums(assay(dds_ts2) >= 10) >= 2
dds_ts2 <- dds_ts2[keep_ts2,] #16246

dds_ts2 <- estimateSizeFactors(dds_ts2)
sizeFactors(dds_ts2)
   st02_1    st02_2    st05_1    st05_2    st09_1    st09_2    st11_1    st11_2    st13_1    st13_2    st17_1    st17_2    st19_1    st19_2    st21_1    st21_2    st23_1    st23_2    st26_1    st26_2    st28_1    st28_2    st31_1    st31_2 st37-38_1 st37-38_2    st43_1    st43_2    st48_1    st48_2    st61_1    st61_2    st66_1    st66_2 
0.7159855 0.6753248 0.8210875 1.0221377 1.2369361 1.2109413 0.9182624 0.8505945 0.8245141 0.8105327 1.1310381 1.0028078 1.1766958 1.1658633 0.8850037 0.9281650 0.9121465 0.8143423 1.3614372 1.5113700 1.3257545 1.6577136 1.5383612 1.2375204 1.2304822 1.3111006 0.8843301 1.2450498 1.0963127 0.8180797 1.1403499 0.9189030 0.8518917 0.6811925 
### Exporting normalized counts
normCounts_dds_ts2<- counts(dds_ts2, normalized=TRUE)

write.table(normCounts_dds_ts2, file="normalisedDeseq_RNA_frog_ts2.tsv", sep="\t", quote=F, col.names=NA)

1.3.3 TPM ts3

###############################
#### TPM
###############################

TPM_txi_both.df <-
  as_tibble(txi_both$abundance, rownames = NA) %>%
  rownames_to_column("gene_id")

# save normalised and gene-level estimated TPM
write.table(TPM_txi_both.df, file = "frog_both_rna_tpm.tsv", sep = "\t", quote = F,row.names=FALSE, col.names = T)


# 42,042 to 30008
keepTPM <- rowSums(TPM_txi_both.df[,-1]) >= 2 # 2 TPM per gene
table(keepTPM)
keepTPM
FALSE  TRUE 
 5479 36563 
TPM_txi_both.df <- TPM_txi_both.df[keepTPM,]

write.table(TPM_txi_both.df, file="frog_both_rna_tpmFilt.tsv", sep="\t", quote=F, row.names = F,col.names=T)

###############################
#### DESeq2
###############################
uniqueBoth = unique(coldata_ts_merged$short_stage)
coldata_ts_merged$condition <- coldata_ts_merged$short_stage
group <- factor(coldata_ts_merged$condition,levels = uniqueBoth) 
coldata_ts_merged$condition <- group

dds_both <- DESeqDataSetFromTximport(txi_both, 
                                     colData = coldata_ts_merged, 
                                     design = ~ time_series + condition)

dds_both <- estimateSizeFactors(dds_both)
sizeFactors(dds_both)
    oo12_1     oo34_1     oo56_1      egg_1      egg_2     st02_1     st02_2     st05_1     st05_2     st08_1     st08_2     st08_3     st09_1     st09_2     st09_3     st09_4     st10_1     st10_2     st10_3     st11_1     st11_2     st12_1     st12_2     st13_1     st13_2     st15_1     st15_2     st17_1     st17_2     st19_1     st19_2     st20_1     st20_2     st21_1     st21_2     st23_1     st23_2     st25_1     st25_2     st26_1     st26_2     st28_1     st28_2     st30_1     st30_2     st31_1     st31_2     st35_1     st35_2     st35_3  st37-38_1  st37-38_2     st40_1     st43_1     st43_2     st48_1     st48_2     st61_1     st61_2     st66_1     st66_2 
1.55983814 1.25744306 1.65575739 0.40425895 0.47753355 0.93642574 0.88024098 1.04622495 1.30566213 1.31951216 0.99987136 5.32693762 1.33494565 1.39923222 1.53784059 1.49649321 0.94601462 0.76096445 3.11013853 1.14981243 1.06919617 0.58307748 0.39726936 1.03801594 1.02326132 0.32992465 0.41911703 1.40240895 1.24551166 1.44692524 1.42277249 0.71428681 0.50505884 1.07214252 1.13330232 1.10712159 0.99112590 1.04252031 0.50845900 1.63045030 1.82532445 1.59874232 2.01016623 1.19181502 0.78262200 1.85067916 1.49762976 0.94683628 0.02519423 2.13975449 1.44914416 1.55215301 0.94978848 1.01689825 1.42951258 1.19252635 0.88846275 1.24673010 1.00620109 0.92157448 0.73449299 
DESeq_both.df <- counts(dds_both, normalized=TRUE) %>% as_tibble(., rownames ="gene_id") # normalised counts

# save normalised and gene-level estimated DESeq2
write.table(DESeq_both.df, file="frog_both_DESeq2.tsv", sep="\t", quote=F, row.names = F, col.names = T)

# Filtering low abundance genes. We  keep genes that are expressed in at least 2 samples with >= 10 
keep_both <- rowSums(assay(dds_both) >= 10) >= 2
dds_both <- dds_both[keep_both,] #16246

dds_both <- estimateSizeFactors(dds_both)
sizeFactors(dds_both)
    oo12_1     oo34_1     oo56_1      egg_1      egg_2     st02_1     st02_2     st05_1     st05_2     st08_1     st08_2     st08_3     st09_1     st09_2     st09_3     st09_4     st10_1     st10_2     st10_3     st11_1     st11_2     st12_1     st12_2     st13_1     st13_2     st15_1     st15_2     st17_1     st17_2     st19_1     st19_2     st20_1     st20_2     st21_1     st21_2     st23_1     st23_2     st25_1     st25_2     st26_1     st26_2     st28_1     st28_2     st30_1     st30_2     st31_1     st31_2     st35_1     st35_2     st35_3  st37-38_1  st37-38_2     st40_1     st43_1     st43_2     st48_1     st48_2     st61_1     st61_2     st66_1     st66_2 
1.55983814 1.25744306 1.65575739 0.40425895 0.47753355 0.93642574 0.88024098 1.04622495 1.30566213 1.31951216 0.99987136 5.32693762 1.33494565 1.39923222 1.53784059 1.49649321 0.94601462 0.76096445 3.11013853 1.14981243 1.06919617 0.58307748 0.39726936 1.03801594 1.02326132 0.32992465 0.41911703 1.40240895 1.24551166 1.44692524 1.42277249 0.71428681 0.50505884 1.07214252 1.13330232 1.10712159 0.99112590 1.04252031 0.50845900 1.63045030 1.82532445 1.59874232 2.01016623 1.19181502 0.78262200 1.85067916 1.49762976 0.94683628 0.02519423 2.13975449 1.44914416 1.55215301 0.94978848 1.01689825 1.42951258 1.19252635 0.88846275 1.24673010 1.00620109 0.92157448 0.73449299 
### Exporting normalized counts
normCounts_both <- counts(dds_both, normalized=TRUE)

write.table(normCounts_both, file="normalisedDeseq_RNA_frog_both.tsv", sep="\t", quote=F, col.names=NA)

1.3.4 PCA

###############################
#### Transform counts for data visualization
###############################

# DESeq2: Variance Stabilising Transformation - log transformed values
rld_ts1 <- vst(dds_ts1, blind=TRUE) #rlog
rld_ts2 <- vst(dds_ts2, blind=TRUE) #rlog
rld_both <- vst(dds_both, blind=TRUE) #rlog

###############################
#### PCA ts1
###############################

rldTs1_mat <- assay(rld_ts1) 
pca.res_ts1 <- prcomp(t(rldTs1_mat),center = T)


pc.var_ts1<-pca.res_ts1$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per_ts1<-round(pc.var_ts1/sum(pc.var_ts1)*100, 1)
pca.res_ts1.df <- as_tibble(pca.res_ts1$x) |> 
  mutate(group= coldata_ts_gene_evo |> pull(condition)) |>
  dplyr::select(group, everything())

coldata_ts_gene_evo <- coldata_ts_gene_evo |> mutate(dev_stage_name = paste(dev_stage_name, replicate, sep = "_"))

labels =coldata_ts_gene_evo %>% pull(dev_stage_name)
color=coldata_ts_gene_evo %>% pull(condition) 
pca.plot_ts1 <- ggplot(pca.res_ts1) + 
  aes(x=PC1, y=PC2, 
      label= labels, 
      color = color) +
  geom_point(size=4) +
  scale_color_viridis_d(option = "C") +
  # scale_color_manual(values = ciona_stages_palette) +
  xlab(paste0("PC1 (",pc.per_ts1[1],"%",")")) +
  ylab(paste0("PC2 (",pc.per_ts1[2],"%",")")) +
  labs(color = "Users By labs")+
  coord_fixed() +
  labs(color = "hpf") + 
  theme_personalised +
  theme(legend.position = c(0.5,0.1),
        legend.direction = "horizontal")

pca.plot_ts1 

ggplotly(pca.plot_ts1) # drop st40 -- it's an egg
cairo_pdf("frog_rna_ts1_pca.pdf", width=8.5, height=5.6)
print(pca.plot_ts1)
invisible(dev.off())

###############################
#### PCA ts2
###############################

rldts2_mat <- assay(rld_ts2) 
pca.res_ts2 <- prcomp(t(rldts2_mat),center = T)


pc.var_ts2<-pca.res_ts2$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per_ts2<-round(pc.var_ts2/sum(pc.var_ts2)*100, 1)
pca.res_ts2.df <- as_tibble(pca.res_ts2$x) |> 
  mutate(group= coldata_ts_expande |> pull(condition)) |>
  dplyr::select(group, everything())

coldata_ts_expande <- coldata_ts_expande |>    mutate(dev_stage_name = paste(dev_stage_name, replicate, sep = "_"))

labels =coldata_ts_expande %>% pull(dev_stage_name)
color=coldata_ts_expande %>% pull(condition) #time
pca.plot_ts2 <- ggplot(pca.res_ts2) + 
  aes(x=PC1, y=PC2, 
      label= labels, 
      color = color) +
  geom_point(size=4) +
  scale_color_viridis_d(option = "C") +
  # scale_color_manual(values = ciona_stages_palette) +
  xlab(paste0("PC1 (",pc.per_ts2[1],"%",")")) +
  ylab(paste0("PC2 (",pc.per_ts2[2],"%",")")) +
  labs(color = "Users By labs")+
  coord_fixed() +
  labs(color = "hpf") + 
  theme_personalised +
  theme(legend.position = c(0.5,0.1),
        legend.direction = "horizontal")

pca.plot_ts2 

ggplotly(pca.plot_ts2) # drop st40 -- it's an egg
cairo_pdf("frog_rna_ts2_pca.pdf", width=8.5, height=5.6)
print(pca.plot_ts2)
invisible(dev.off())

###############################
#### PCA both
###############################

rldboth_mat <- assay(rld_both) #[, !grepl("^adult", colnames(assay(rld)))]
pca.res_both <- prcomp(t(rldboth_mat),center = T)


pc.var_both<-pca.res_both$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per_both<-round(pc.var_both/sum(pc.var_both)*100, 1)
pca.res_both.df <- as_tibble(pca.res_both$x) |> 
  mutate(group= coldata_ts_merged |> pull(condition)) |>
  dplyr::select(group, everything())

coldata_ts_merged <- coldata_ts_merged |>    mutate(dev_stage_name = paste(dev_stage_name, replicate, sep = "_"))

labels =coldata_ts_merged %>% pull(dev_stage_name)
color=coldata_ts_merged %>% pull(condition) #time
pca.plot_both <- ggplot(pca.res_both) + 
  aes(x=PC1, y=PC2, 
      label= labels, 
      color = color) +
  geom_point(size=4) +
  scale_color_viridis_d(option = "C") +
  # scale_color_manual(values = ciona_stages_palette) +
  xlab(paste0("PC1 (",pc.per_both[1],"%",")")) +
  ylab(paste0("PC2 (",pc.per_both[2],"%",")")) +
  labs(color = "Users By labs")+
  coord_fixed() +
  labs(color = "hpf") + 
  theme_personalised +
  theme(legend.position = c(0.5,0.1),
        legend.direction = "horizontal")

pca.plot_both

ggplotly(pca.plot_both) 
color=coldata_ts_merged %>% pull(time_series) 

pca.plot_bothPaper <- ggplot(pca.res_both) + 
  aes(x=PC1, y=PC2, 
      label= labels, 
      color = color) +
  geom_point(size=4) +
  scale_color_viridis_d(option = "C") +
  # scale_color_manual(values = ciona_stages_palette) +
  xlab(paste0("PC1 (",pc.per_both[1],"%",")")) +
  ylab(paste0("PC2 (",pc.per_both[2],"%",")")) +
  labs(color = "Users By labs")+
  coord_fixed() +
  labs(color = "hpf") + 
  theme_personalised +
  theme(legend.position = c(0.5,0.1),
        legend.direction = "horizontal")

pca.plot_bothPaper

ggplotly(pca.plot_bothPaper) 
cairo_pdf("frog_rna_both_pca.pdf", width=8.5, height=5.6)
print(pca.plot_bothPaper)
invisible(dev.off())

1.3.5 Correlation

###############################
#### Heatmap correlation ts1
###############################

### Compute pairwise correlation values

stages_ts1= TPM_txi_ts1.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>% pull(stage) |> unique()

log2.ts1.df <-
  TPM_txi_ts1.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
  mutate(expression = log2(expression + 1)) %>%
  mutate(stage = factor(stage,stages_ts1)) |> 
  pivot_wider(names_from = stage,values_from = expression) 

###############################
#### with average  biological replicates
###############################


stages_ts1_mean= coldata_ts_gene_evo %>% pull(short_stage) |> unique()
ts1_meanRep.df <- mean_by_repStage.fn(TPM_txi_ts1.df,stages_ts1_mean) 

# TPM meanRep filt log2
log2.ts1_meanRep.df_meanRep.df <-
  ts1_meanRep.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
  mutate(expression = log2(expression + 1)) %>%
  mutate(stage = factor(stage,stages_ts1_mean)) |> 
  pivot_wider(names_from = stage,values_from = expression)

### Compute pairwise correlation values and then order matrix
TPM_ts1_cor_spearman <- cor(log2.ts1.df[,-1],method = "spearman")
TPM_ts1_cor_spearman <- TPM_ts1_cor_spearman[stages_ts1,(stages_ts1)]

TPM_ts1_mean_cor_spearman <- cor(log2.ts1_meanRep.df_meanRep.df[,-1],method = "spearman")
TPM_ts1_mean_cor_spearman <- TPM_ts1_mean_cor_spearman[stages_ts1_mean,(stages_ts1_mean)]

heatmap_TPM_ts1_spearman <- heatmap_function(TPM_ts1_cor_spearman,"TPM ts1","Spearman")
heatmap_TPM_ts1_spearman

heatmap_TPM_ts1mean_spearman <- heatmap_function(TPM_ts1_mean_cor_spearman,"TPM","Spearman")
heatmap_TPM_ts1mean_spearman

#__________________________________________________________________________

###############################
#### Heatmap correlation ts2
###############################

### Compute pairwise correlation values
stages_ts2= TPM_txi_ts2.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>% pull(stage) |> unique()

log2.ts2.df <-
  TPM_txi_ts2.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
  mutate(expression = log2(expression + 1)) %>%
  mutate(stage = factor(stage,stages_ts2)) |> 
  pivot_wider(names_from = stage,values_from = expression) 

###############################
#### with average  biological replicates
###############################
stages_ts2_mean = coldata_ts_expande %>% pull(short_stage) |> unique()
ts2_meanRep.df <- mean_by_repStage.fn(TPM_txi_ts2.df,stages_ts2_mean) 

# TPM meanRep filt log2
log2.ts2_meanRep.df_meanRep.df <-
  ts2_meanRep.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
  mutate(expression = log2(expression + 1)) %>%
  mutate(stage = factor(stage,stages_ts2_mean)) |> 
  pivot_wider(names_from = stage,values_from = expression)

### Compute pairwise correlation values and then order matrix
TPM_ts2_cor_spearman <- cor(log2.ts2.df[,-1],method = "spearman")
TPM_ts2_cor_spearman <- TPM_ts2_cor_spearman[stages_ts2,(stages_ts2)]

TPM_ts2_mean_cor_spearman <- cor(log2.ts2_meanRep.df_meanRep.df[,-1],method = "spearman")
TPM_ts2_mean_cor_spearman <- TPM_ts2_mean_cor_spearman[stages_ts2_mean,(stages_ts2_mean)]

heatmap_TPM_ts2_spearman <- heatmap_function(TPM_ts2_cor_spearman,"TPM ts2","Spearman")
heatmap_TPM_ts2_spearman

heatmap_TPM_ts2mean_spearman <- heatmap_function(TPM_ts2_mean_cor_spearman,"TPM","Spearman")
heatmap_TPM_ts2mean_spearman

#__________________________________________________________________________
###############################
#### Heatmap correlation ts both
###############################

### Compute pairwise correlation values

stages_both= TPM_txi_both.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>% pull(stage) |> unique()

log2.both.df <-
  TPM_txi_both.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
  mutate(expression = log2(expression + 1)) %>%
  mutate(stage = factor(stage,stages_both)) |> 
  pivot_wider(names_from = stage,values_from = expression) 

###############################
#### with average  biological replicates
###############################

stages_both_mean= coldata_ts_merged %>% pull(short_stage) |> unique()
# stages_both_mean= TPM_txi_both.df |> pivot_longer(-gene_id, names_to = "stage") %>%
#       separate(stage, into = c("stage", "rep"), sep = "_") |> pull("stage")

both_meanRep.df <- mean_by_repStage.fn(TPM_txi_both.df,stages_both_mean) 

# TPM meanRep filt log2
log2.both_meanRep.df_meanRep.df <-
  both_meanRep.df %>%  pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
  mutate(expression = log2(expression + 1)) %>%
  mutate(stage = factor(stage,stages_both_mean)) |> 
  pivot_wider(names_from = stage,values_from = expression)

### Compute pairwise correlation values and then order matrix
TPM_both_cor_spearman <- cor(log2.both.df[,-1],method = "spearman")
TPM_both_cor_spearman <- TPM_both_cor_spearman[stages_both,(stages_both)]

TPM_both_mean_cor_spearman <- cor(log2.both_meanRep.df_meanRep.df[,-1],method = "spearman")
TPM_both_mean_cor_spearman <- TPM_both_mean_cor_spearman[stages_both_mean,(stages_both_mean)]

heatmap_TPM_both_spearman <- heatmap_function(TPM_both_cor_spearman,"TPM both","Spearman")
heatmap_TPM_both_spearman

heatmap_TPM_both_mean_spearman <- heatmap_function(TPM_both_mean_cor_spearman,"TPM","Spearman")
heatmap_TPM_both_mean_spearman

#__________________________________________________________________________

cairo_pdf("rna_ts1Mean_spearman.pdf", width=8.5, height=8.5)
print(draw(heatmap_TPMts1_mean_spearman, 
     column_title = "Spearman", 
     column_title_gp = gpar(fontsize = 16),
     heatmap_legend_side = "bottom", 
     annotation_legend_side = "bottom") )
Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'draw': object 'heatmap_TPMts1_mean_spearman' not found
invisible(dev.off())

cairo_pdf("rna_ts2Mean_spearman.pdf", width=8.5, height=8.5)
print(draw(heatmap_TPMts2_mean_spearman, 
     column_title = "Spearman",
     column_title_gp = gpar(fontsize = 16),
     heatmap_legend_side = "bottom", 
     annotation_legend_side = "bottom") )
Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'draw': object 'heatmap_TPMts2_mean_spearman' not found
invisible(dev.off())

cairo_pdf("rna_tsbothMean_spearman.pdf", width=8.5, height=8.5)
print(draw(heatmap_TPM_both_mean_spearman, 
     column_title = "Spearman", 
     column_title_gp = gpar(fontsize = 16),
     heatmap_legend_side = "bottom", 
     annotation_legend_side = "bottom"))
invisible(dev.off())